Importing Libraries used for this analysis

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(rlang)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggthemes)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.2     ✔ tibble  3.2.1
## ✔ readr   2.1.4     ✔ tidyr   1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::%@%()         masks rlang::%@%()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ purrr::flatten()     masks rlang::flatten()
## ✖ purrr::flatten_chr() masks rlang::flatten_chr()
## ✖ purrr::flatten_dbl() masks rlang::flatten_dbl()
## ✖ purrr::flatten_int() masks rlang::flatten_int()
## ✖ purrr::flatten_lgl() masks rlang::flatten_lgl()
## ✖ purrr::flatten_raw() masks rlang::flatten_raw()
## ✖ purrr::invoke()      masks rlang::invoke()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ purrr::splice()      masks rlang::splice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(corrplot)
## corrplot 0.92 loaded

loading the Dataset

# Load the pre-cleaned data set; the path is relative to the working directory.
Cleaned_bitcoin_mining <- read.csv(file = "Cleaned_bitcoin_mining.csv")

# Preview the first six rows to confirm the columns were read as expected.
head(Cleaned_bitcoin_mining, n = 6L)
##         Date.and.Time power.MAX..GW power.MIN..GW power.GUESS..GW
## 1 2010-07-18T00:00:00      2.67e-05      2.24e-05        2.44e-05
## 2 2010-07-19T00:00:00      2.68e-05      2.26e-05        2.46e-05
## 3 2010-07-20T00:00:00      2.72e-05      2.29e-05        2.50e-05
## 4 2010-07-21T00:00:00      2.84e-05      2.39e-05        2.61e-05
## 5 2010-07-22T00:00:00      2.82e-05      2.37e-05        2.59e-05
## 6 2010-07-23T00:00:00      2.85e-05      2.40e-05        2.61e-05
##   annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
## 1                     0.000233717                     0.000196712
## 2                     0.000235075                     0.000197855
## 3                     0.000238699                     0.000200905
## 4                     0.000249343                     0.000209864
## 5                     0.000247305                     0.000208148
## 6                     0.000250023                     0.000210436
##   annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
## 1                       0.000214241                     14313700
## 2                       0.000215486                     14313700
## 3                       0.000218808                     14313700
## 4                       0.000228565                     14313700
## 5                       0.000226696                     14313700
## 6                       0.000229188                     14313700
##   Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th Hydro.only..MtCO2e
## 1                   14313700                     14313700              4e-06
## 2                   14313700                     14313700              5e-06
## 3                   14313700                     14313700              5e-06
## 4                   14313700                     14313700              5e-06
## 5                   14313700                     14313700              5e-06
## 6                   14313700                     14313700              5e-06
##   Estimated..MtCO2e Coal.only..MtCO2e Emission.intensity..gCO2e.kWh
## 1          0.000119          0.000214                      554.1215
## 2          0.000119          0.000216                      554.1215
## 3          0.000121          0.000219                      554.1215
## 4          0.000127          0.000229                      554.1215
## 5          0.000126          0.000227                      554.1215
## 6          0.000127          0.000229                      554.1215
##   Hash.rate.MH.s
## 1    0.001606373
## 2    0.001822962
## 3    0.001822962
## 4    0.001750766
## 5    0.001669545
## 6    0.001669545

Checking the dimension and Structure of data

# Row count followed by column count.
c(nrow(Cleaned_bitcoin_mining), ncol(Cleaned_bitcoin_mining))
## [1] 4815   15
# Compact listing of every column's type and leading values.
str(object = Cleaned_bitcoin_mining)
## 'data.frame':    4815 obs. of  15 variables:
##  $ Date.and.Time                    : chr  "2010-07-18T00:00:00" "2010-07-19T00:00:00" "2010-07-20T00:00:00" "2010-07-21T00:00:00" ...
##  $ power.MAX..GW                    : num  2.67e-05 2.68e-05 2.72e-05 2.84e-05 2.82e-05 2.85e-05 2.86e-05 2.99e-05 3.15e-05 3.23e-05 ...
##  $ power.MIN..GW                    : num  2.24e-05 2.26e-05 2.29e-05 2.39e-05 2.37e-05 2.40e-05 2.41e-05 2.52e-05 2.65e-05 2.72e-05 ...
##  $ power.GUESS..GW                  : num  2.44e-05 2.46e-05 2.50e-05 2.61e-05 2.59e-05 2.61e-05 2.62e-05 2.74e-05 2.88e-05 2.96e-05 ...
##  $ annualised.consumption.MAX..TWh  : num  0.000234 0.000235 0.000239 0.000249 0.000247 ...
##  $ annualised.consumption.MIN..TWh  : num  0.000197 0.000198 0.000201 0.00021 0.000208 ...
##  $ annualised.consumption.GUESS..TWh: num  0.000214 0.000215 0.000219 0.000229 0.000227 ...
##  $ Lower.bound.efficiency..J.Th     : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Estimated.efficiency..J.Th       : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Upper.bound.efficiency..J.Th     : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Hydro.only..MtCO2e               : num  4e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 ...
##  $ Estimated..MtCO2e                : num  0.000119 0.000119 0.000121 0.000127 0.000126 0.000127 0.000127 0.000133 0.00014 0.000144 ...
##  $ Coal.only..MtCO2e                : num  0.000214 0.000216 0.000219 0.000229 0.000227 0.000229 0.00023 0.000241 0.000253 0.00026 ...
##  $ Emission.intensity..gCO2e.kWh    : num  554 554 554 554 554 ...
##  $ Hash.rate.MH.s                   : num  0.00161 0.00182 0.00182 0.00175 0.00167 ...

Summary Statistics

# Per-column summary: five-number summary and mean for numeric columns,
# length/class/mode for character columns.
summary(object = Cleaned_bitcoin_mining)
##  Date.and.Time      power.MAX..GW      power.MIN..GW      power.GUESS..GW    
##  Length:4815        Min.   : 0.00003   Min.   :0.000022   Min.   : 0.000024  
##  Class :character   1st Qu.: 0.39179   1st Qu.:0.031152   1st Qu.: 0.154086  
##  Mode  :character   Median : 2.12457   Median :0.384142   Median : 0.905217  
##                     Mean   : 9.82974   Mean   :2.039373   Mean   : 3.989582  
##                     3rd Qu.:15.41883   3rd Qu.:4.049493   3rd Qu.: 7.710647  
##                     Max.   :56.01570   Max.   :8.947454   Max.   :15.063222  
##  annualised.consumption.MAX..TWh annualised.consumption.MIN..TWh
##  Min.   :  0.0002                Min.   : 0.0002                
##  1st Qu.:  3.4344                1st Qu.: 0.2731                
##  Median : 18.6240                Median : 3.3674                
##  Mean   : 86.1675                Mean   :17.8771                
##  3rd Qu.:135.1615                3rd Qu.:35.4978                
##  Max.   :491.0337                Max.   :78.4334                
##  annualised.consumption.GUESS..TWh Lower.bound.efficiency..J.Th
##  Min.   :  0.00021                 Min.   :      21            
##  1st Qu.:  1.35072                 1st Qu.:      38            
##  Median :  7.93513                 Median :      98            
##  Mean   : 34.97267                 Mean   :  458086            
##  3rd Qu.: 67.59153                 3rd Qu.:    9917            
##  Max.   :132.04420                 Max.   :14313700            
##  Estimated.efficiency..J.Th Upper.bound.efficiency..J.Th Hydro.only..MtCO2e
##  Min.   :      31           Min.   :      46             Min.   :0.000004  
##  1st Qu.:      68           1st Qu.:     167             1st Qu.:0.028365  
##  Median :     261           Median :     766             Median :0.166638  
##  Mean   :  771891           Mean   : 1292594             Mean   :0.734426  
##  3rd Qu.:   36553           3rd Qu.:   75000             3rd Qu.:1.419422  
##  Max.   :14313700           Max.   :14313700             Max.   :2.772928  
##  Estimated..MtCO2e  Coal.only..MtCO2e   Emission.intensity..gCO2e.kWh
##  Min.   : 0.00012   Min.   :  0.00021   Min.   :359.5                
##  1st Qu.: 0.75628   1st Qu.:  1.35207   1st Qu.:512.8                
##  Median : 4.22858   Median :  7.94307   Median :533.7                
##  Mean   :17.95686   Mean   : 35.00765   Mean   :532.2                
##  3rd Qu.:31.96006   3rd Qu.: 67.65912   3rd Qu.:559.0                
##  Max.   :66.90830   Max.   :132.17625   Max.   :594.6                
##  Hash.rate.MH.s     
##  Min.   :        0  
##  1st Qu.:     3838  
##  Median :  3210303  
##  Mean   : 64397862  
##  3rd Qu.:111495251  
##  Max.   :506061817

From the summary Statistics, we can sense the distribution, central tendency and range of each variable, as well as the presence of missing values.

Data cleaning

Checking for missing values

# Total number of missing cells: count NAs per column, then add them up.
sum(colSums(is.na(Cleaned_bitcoin_mining)))
## [1] 0

There are no missing values: this is the cleaned dataset, and every column has complete data for all rows.

Checking number of Unique values

# Number of distinct values in each column.
# vapply() is used instead of sapply() so the result is always a named
# integer vector, whatever the shape of the input.
vapply(
  Cleaned_bitcoin_mining,
  function(column) length(unique(column)),
  FUN.VALUE = integer(1)
)
##                     Date.and.Time                     power.MAX..GW 
##                              4815                              4767 
##                     power.MIN..GW                   power.GUESS..GW 
##                              4745                              4771 
##   annualised.consumption.MAX..TWh   annualised.consumption.MIN..TWh 
##                              4771                              4750 
## annualised.consumption.GUESS..TWh      Lower.bound.efficiency..J.Th 
##                              4774                                24 
##        Estimated.efficiency..J.Th      Upper.bound.efficiency..J.Th 
##                               275                                44 
##                Hydro.only..MtCO2e                 Estimated..MtCO2e 
##                              4543                              4757 
##                 Coal.only..MtCO2e     Emission.intensity..gCO2e.kWh 
##                              4761                                39 
##                    Hash.rate.MH.s 
##                              3801

Date and Time has 4815 unique values, which means that each row corresponds to a unique timestamp. Most of the columns have a large number of unique values, suggesting continuous data, but a few columns such as “Lower bound efficiency, J/Th”, “Upper bound efficiency, J/Th”, and “Emission intensity, gCO2e/kWh” have far fewer distinct values, indicating potential categories or repeated measurements.

Changing the “Date and Time” column's datatype to a datetime format

# Parse the timestamp strings (e.g. "2010-07-18T00:00:00") into POSIXct.
# The strings carry no zone information, so the zone is pinned to UTC:
# without `tz`, as.POSIXct() uses the machine's local zone, which makes the
# parsed instants (and any later date comparison) depend on where the
# script is run and on local daylight-saving rules.
Cleaned_bitcoin_mining$Date.and.Time <- as.POSIXct(
  Cleaned_bitcoin_mining$Date.and.Time,
  format = "%Y-%m-%dT%H:%M:%S",
  tz = "UTC"
)

# A timestamp that does not match `format` becomes NA silently; stop here
# rather than carry unparsed dates into the rest of the analysis.
stopifnot(!anyNA(Cleaned_bitcoin_mining$Date.and.Time))

# Confirm the column type changed.
str(Cleaned_bitcoin_mining)
## 'data.frame':    4815 obs. of  15 variables:
##  $ Date.and.Time                    : POSIXct, format: "2010-07-18" "2010-07-19" ...
##  $ power.MAX..GW                    : num  2.67e-05 2.68e-05 2.72e-05 2.84e-05 2.82e-05 2.85e-05 2.86e-05 2.99e-05 3.15e-05 3.23e-05 ...
##  $ power.MIN..GW                    : num  2.24e-05 2.26e-05 2.29e-05 2.39e-05 2.37e-05 2.40e-05 2.41e-05 2.52e-05 2.65e-05 2.72e-05 ...
##  $ power.GUESS..GW                  : num  2.44e-05 2.46e-05 2.50e-05 2.61e-05 2.59e-05 2.61e-05 2.62e-05 2.74e-05 2.88e-05 2.96e-05 ...
##  $ annualised.consumption.MAX..TWh  : num  0.000234 0.000235 0.000239 0.000249 0.000247 ...
##  $ annualised.consumption.MIN..TWh  : num  0.000197 0.000198 0.000201 0.00021 0.000208 ...
##  $ annualised.consumption.GUESS..TWh: num  0.000214 0.000215 0.000219 0.000229 0.000227 ...
##  $ Lower.bound.efficiency..J.Th     : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Estimated.efficiency..J.Th       : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Upper.bound.efficiency..J.Th     : num  14313700 14313700 14313700 14313700 14313700 ...
##  $ Hydro.only..MtCO2e               : num  4e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 5e-06 ...
##  $ Estimated..MtCO2e                : num  0.000119 0.000119 0.000121 0.000127 0.000126 0.000127 0.000127 0.000133 0.00014 0.000144 ...
##  $ Coal.only..MtCO2e                : num  0.000214 0.000216 0.000219 0.000229 0.000227 0.000229 0.00023 0.000241 0.000253 0.00026 ...
##  $ Emission.intensity..gCO2e.kWh    : num  554 554 554 554 554 ...
##  $ Hash.rate.MH.s                   : num  0.00161 0.00182 0.00182 0.00175 0.00167 ...
 # Classes carried by the converted column.
 class(Cleaned_bitcoin_mining[["Date.and.Time"]])
## [1] "POSIXct" "POSIXt"
 # Earliest and latest timestamps present in the data.
 date_range <- range(Cleaned_bitcoin_mining[["Date.and.Time"]])

 print(date_range)
## [1] "2010-07-18 EDT" "2023-09-22 EDT"

We change the Date and Time column's datatype to POSIXct because many plotting functions understand POSIXct/POSIXt values and will correctly format axes and labels when plotting datetimes; it is also better suited to date-based data manipulation and operations.

Univariate Analysis - Analyzing one variable at a time

Histograms - Histograms give insight into the distribution of continuous variables and help us understand the central tendency, spread, and shape of each variable's distribution.

# Columns selected for the univariate analysis.
variables <- c(
  "power.GUESS..GW", "annualised.consumption.GUESS..TWh",
  "Estimated.efficiency..J.Th", "Hydro.only..MtCO2e", "Estimated..MtCO2e",
  "Coal.only..MtCO2e", "Emission.intensity..gCO2e.kWh", "Hash.rate.MH.s"
)

# Display labels for the plots, in the same order as `variables`.
var_names <- c(
  "Power (GW)", "Annualised Consumption (TWh)", "Estimated Efficiency (J/Th)",
  "Hydro Only Emissions (MtCO2e)", "Estimated Emissions (MtCO2e)",
  "Coal Only Emissions (MtCO2e)", "Emission Intensity (gCO2e/kWh)",
  "Hash Rate (MH/s)"
)

# The two vectors are matched by position, so they must stay the same length.
stopifnot(length(variables) == length(var_names))

# Reshape to long format (one row per variable/value pair) for facetting.
df_long <- Cleaned_bitcoin_mining %>%
  select(all_of(variables)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

# Fix the facet order and swap column names for the display labels.
df_long$Variable <- factor(df_long$Variable, levels = variables, labels = var_names)

# All histograms in one view, each with a frequency polygon overlaid.
# after_stat(count) and `linewidth` replace the deprecated `..count..` and
# `size`; `bins` is passed to both layers so they bin identically and
# stat_bin() does not fall back to its default with a message.
p <- ggplot(df_long, aes(x = Value)) +
  geom_histogram(
    aes(y = after_stat(count)),
    fill = "#66c2a5", color = "#004d40", bins = 30
  ) +
  geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
  facet_wrap(~ Variable, scales = "free", ncol = 2) +
  theme_minimal() +
  labs(title = "Histograms of Selected Variables", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Render the plot object currently stored in `p`.
print(p)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# printing each histogram for better view
# One full-size histogram per variable, for a closer look than the grid.
# seq_along() is used instead of 1:length() so an empty `variables` vector
# yields zero iterations rather than iterating over c(1, 0).
for (i in seq_along(variables)) {
  # Rows of the long table belonging to the i-th display label.
  df_subset <- df_long[df_long$Variable == var_names[i], ]

  # after_stat(count) / linewidth replace the deprecated `..count..` / `size`;
  # `bins` is given to both layers so histogram and polygon bin identically.
  p <- ggplot(df_subset, aes(x = Value)) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(title = paste("Histogram of", var_names[i]), y = "Frequency") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  print(p)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Outliers

Boxplots- Boxplots are useful to visualize outliers and helpful to understand the spread and skewness of the data and they also show the median, quartiles, and potential outliers for each variable.

# One box plot per analysed variable; points beyond the whiskers are drawn
# in red.
for (i in seq_along(variables)) {
  # `.data[[name]]` looks the column up inside the plot's own data, which is
  # the supported way to map a column whose name is held in a string
  # (indexing the data frame directly inside aes() triggers a ggplot2 warning).
  p <- ggplot(Cleaned_bitcoin_mining, aes(y = .data[[variables[i]]])) +
    geom_boxplot(
      fill = "#66c2a5", color = "#004d40",
      outlier.color = "red", outlier.size = 2
    ) +
    labs(title = paste("Box Plot of", var_names[i]), y = var_names[i]) +
    theme_minimal()

  print(p)
}
## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

## Warning: Use of `Cleaned_bitcoin_mining[[variables[i]]]` is discouraged.
## ℹ Use `.data[[variables[i]]]` instead.

IQR

# Columns checked for outliers.
variables <- c(
  "power.GUESS..GW", "annualised.consumption.GUESS..TWh",
  "Estimated.efficiency..J.Th", "Hydro.only..MtCO2e", "Estimated..MtCO2e",
  "Coal.only..MtCO2e", "Emission.intensity..gCO2e.kWh", "Hash.rate.MH.s"
)

# For each variable, count the observations lying more than 1.5 * IQR below
# the first quartile or above the third quartile.
# vapply() guarantees one integer per variable and, because `variables` is a
# character vector, names the result by variable automatically.
outliers_counts <- vapply(variables, function(var) {
  values <- Cleaned_bitcoin_mining[[var]]

  # Both quartiles in one call; names = FALSE drops the "25%"/"75%" labels.
  quartiles <- quantile(values, probs = c(0.25, 0.75), names = FALSE)
  # Named `iqr_width` so it does not shadow stats::IQR().
  iqr_width <- quartiles[2] - quartiles[1]

  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width

  # Summing the logical mask counts the flagged values directly.
  sum(values < lower_bound | values > upper_bound)
}, FUN.VALUE = integer(1))

outliers_counts
##                   power.GUESS..GW annualised.consumption.GUESS..TWh 
##                                 0                                 0 
##        Estimated.efficiency..J.Th                Hydro.only..MtCO2e 
##                              1097                                 0 
##                 Estimated..MtCO2e                 Coal.only..MtCO2e 
##                                 0                                 0 
##     Emission.intensity..gCO2e.kWh                    Hash.rate.MH.s 
##                               214                               254

Bitcoin’s popularity, mining difficulty, and technology have evolved over time. Extreme values in recent years might reflect genuine shifts in the ecosystem and whereas early outliers might indicate data sparsity or other anomalies.

Cap/Floor Outliers- Instead of removing the outliers, we can cap them.

If we believe the extreme values are genuine (not errors) but they still unduly influence the analysis, we can consider capping them at a threshold such as the lower and upper bounds determined by the IQR method, as this retains the data but reduces the skewness.

For example, any value below the lower bound is set to the lower bound, and any value above the upper bound is set to the upper bound. This approach keeps every observation while limiting the influence of the extreme values.

# Cap/floor on a copy so the original data stays available for comparison.
Cleaned_bitcoin_mining_copy <- Cleaned_bitcoin_mining

for (var in variables) {
  values <- Cleaned_bitcoin_mining_copy[[var]]

  # Quartiles of the (not yet capped) column; names = FALSE keeps the bounds
  # as plain unnamed numbers.
  quartiles <- quantile(values, probs = c(0.25, 0.75), names = FALSE)
  # Named `iqr_width` so no top-level object called `IQR` masks stats::IQR().
  iqr_width <- quartiles[2] - quartiles[1]

  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width

  # Clamp into [lower_bound, upper_bound]: pmax() raises values below the
  # floor, pmin() lowers values above the cap; everything else is unchanged.
  Cleaned_bitcoin_mining_copy[[var]] <- pmin(pmax(values, lower_bound), upper_bound)
}

# Summary of the capped columns, for comparison with the uncapped summary.
summary(Cleaned_bitcoin_mining_copy[variables])
##  power.GUESS..GW     annualised.consumption.GUESS..TWh
##  Min.   : 0.000024   Min.   :  0.00021                
##  1st Qu.: 0.154086   1st Qu.:  1.35072                
##  Median : 0.905217   Median :  7.93513                
##  Mean   : 3.989582   Mean   : 34.97267                
##  3rd Qu.: 7.710647   3rd Qu.: 67.59153                
##  Max.   :15.063222   Max.   :132.04420                
##  Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e 
##  Min.   :   31.13           Min.   :0.000004   Min.   : 0.00012  
##  1st Qu.:   67.72           1st Qu.:0.028365   1st Qu.: 0.75628  
##  Median :  260.92           Median :0.166638   Median : 4.22858  
##  Mean   :23180.17           Mean   :0.734426   Mean   :17.95686  
##  3rd Qu.:36553.00           3rd Qu.:1.419422   3rd Qu.:31.96006  
##  Max.   :91280.91           Max.   :2.772928   Max.   :66.90830  
##  Coal.only..MtCO2e   Emission.intensity..gCO2e.kWh Hash.rate.MH.s     
##  Min.   :  0.00021   Min.   :443.5                 Min.   :        0  
##  1st Qu.:  1.35207   1st Qu.:512.8                 1st Qu.:     3838  
##  Median :  7.94307   Median :533.7                 Median :  3210303  
##  Mean   : 35.00765   Mean   :534.2                 Mean   : 60413666  
##  3rd Qu.: 67.65912   3rd Qu.:559.0                 3rd Qu.:111495251  
##  Max.   :132.17625   Max.   :594.6                 Max.   :278732371
# Histogram of every analysed variable after capping/flooring.
for (var in variables) {
  # aes(x = .data[[var]]) replaces the deprecated aes_string(); the x label is
  # set explicitly so the axis still shows the column name.
  # after_stat(count) / linewidth replace the deprecated `..count..` / `size`.
  p <- ggplot(Cleaned_bitcoin_mining_copy, aes(x = .data[[var]])) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(
      title = paste("Histogram of", var, "after Capping/Flooring"),
      x = var,
      y = "Frequency"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

After capping and flooring, the extreme values in the data were limited to a more standardized range. The capped/floored data likely still retains its right-skewed nature for many variables.

The spread of data might appear more compact now without the long tails that were previously present due to outliers.

Apply log transformation - Log transformation is a data transformation method that replaces each value x of a variable with log(x).

If the data is heavily skewed, applying the log transformation can make the data more interpretable; it is especially useful when there are extreme values or outliers.

For right- skewed data, if we use log transformation, we can compress the long tail and make the distribution more symmetrical.

It has the effect of compressing the higher values more than the lower values, which can be particularly useful for right skewed data.

After the log-transformation, we expect the peaks of these polygons to shift towards the center, indicating a more normalised distribution. Outliers will be closer to the main data cluster, making them less extreme
# Work on a copy and add one `log_<name>` column per analysed variable.
# log1p(x) computes log(1 + x), which stays defined when x is zero.
Cleaned_bitcoin_mining_log <- Cleaned_bitcoin_mining
log_columns <- paste0("log_", variables)
Cleaned_bitcoin_mining_log[log_columns] <- lapply(
  Cleaned_bitcoin_mining[variables],
  log1p
)

# Visualize the log-transformed data
for (var in log_columns) {
  # aes(x = .data[[var]]) replaces the deprecated aes_string(); the x label is
  # set explicitly so the axis still shows the column name.
  # after_stat(count) / linewidth replace the deprecated `..count..` / `size`.
  p <- ggplot(Cleaned_bitcoin_mining_log, aes(x = .data[[var]])) +
    geom_histogram(
      aes(y = after_stat(count)),
      fill = "#66c2a5", color = "#004d40", bins = 30
    ) +
    geom_freqpoly(color = "#e34a33", linewidth = 1, bins = 30) +
    labs(
      title = paste("Histogram of Log Transformed", var),
      x = var,
      y = "Frequency"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  print(p)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The log1p function is used for the transformation because it computes the natural logarithm of 1 + x. It is useful in cases where values might be zero, as it ensures the transformed value remains defined.

It can help stabilize the variance, making the data closer to a normal distribution, and reduce the influence of outliers, especially for right-skewed data.

Segmentation Analysis - A method used to divide a data set into subsets (here: with outliers, i.e. the original data, and without outliers).

Instead of simply removing the outliers, we can perform a segmented analysis: one with the entire dataset and one without outliers.

# Build one row mask: a row is kept only if it lies inside the
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] fence of every analysed variable.
# The fences are always computed from the full data set.
inside_all_fences <- rep(TRUE, nrow(Cleaned_bitcoin_mining))

for (var in variables) {
  column_values <- Cleaned_bitcoin_mining[[var]]

  first_quartile <- quantile(column_values, 0.25)
  third_quartile <- quantile(column_values, 0.75)
  fence_step <- 1.5 * (third_quartile - first_quartile)

  inside_all_fences <- inside_all_fences &
    column_values >= first_quartile - fence_step &
    column_values <= third_quartile + fence_step
}

# The two segments: outlier-free rows versus the untouched data.
data_without_outliers <- Cleaned_bitcoin_mining[inside_all_fences, ]
data_with_outliers <- Cleaned_bitcoin_mining

summary_with_outliers <- summary(data_with_outliers[variables])
summary_without_outliers <- summary(data_without_outliers[variables])

# Show both summaries together for a side-by-side read.
list(
  Without_Outliers = summary_without_outliers,
  With_Outliers = summary_with_outliers
)
## $Without_Outliers
##  power.GUESS..GW     annualised.consumption.GUESS..TWh
##  Min.   : 0.000478   Min.   :  0.00419                
##  1st Qu.: 0.539391   1st Qu.:  4.72830                
##  Median : 3.611244   Median : 31.65617                
##  Mean   : 4.398428   Mean   : 38.55662                
##  3rd Qu.: 8.442700   3rd Qu.: 74.00870                
##  Max.   :13.266792   Max.   :116.29670                
##  Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e 
##  Min.   :   33.43           Min.   :0.000088   Min.   : 0.00236  
##  1st Qu.:   68.34           1st Qu.:0.099294   1st Qu.: 2.57071  
##  Median :  182.88           Median :0.664779   Median :16.67565  
##  Mean   : 3524.93           Mean   :0.809689   Mean   :20.32547  
##  3rd Qu.:  850.32           3rd Qu.:1.554183   3rd Qu.:38.74943  
##  Max.   :58750.00           Max.   :2.442231   Max.   :64.73054  
##  Coal.only..MtCO2e  Emission.intensity..gCO2e.kWh Hash.rate.MH.s     
##  Min.   :  0.0042   Min.   :462.5                 Min.   :        7  
##  1st Qu.:  4.7330   1st Qu.:512.9                 1st Qu.:   450189  
##  Median : 31.6878   Median :533.7                 Median : 15023580  
##  Mean   : 38.5952   Mean   :533.9                 Mean   : 60112171  
##  3rd Qu.: 74.0827   3rd Qu.:554.5                 3rd Qu.:112650400  
##  Max.   :116.4130   Max.   :594.6                 Max.   :277924882  
## 
## $With_Outliers
##  power.GUESS..GW     annualised.consumption.GUESS..TWh
##  Min.   : 0.000024   Min.   :  0.00021                
##  1st Qu.: 0.154086   1st Qu.:  1.35072                
##  Median : 0.905217   Median :  7.93513                
##  Mean   : 3.989582   Mean   : 34.97267                
##  3rd Qu.: 7.710647   3rd Qu.: 67.59153                
##  Max.   :15.063222   Max.   :132.04420                
##  Estimated.efficiency..J.Th Hydro.only..MtCO2e Estimated..MtCO2e 
##  Min.   :      31           Min.   :0.000004   Min.   : 0.00012  
##  1st Qu.:      68           1st Qu.:0.028365   1st Qu.: 0.75628  
##  Median :     261           Median :0.166638   Median : 4.22858  
##  Mean   :  771891           Mean   :0.734426   Mean   :17.95686  
##  3rd Qu.:   36553           3rd Qu.:1.419422   3rd Qu.:31.96006  
##  Max.   :14313700           Max.   :2.772928   Max.   :66.90830  
##  Coal.only..MtCO2e   Emission.intensity..gCO2e.kWh Hash.rate.MH.s     
##  Min.   :  0.00021   Min.   :359.5                 Min.   :        0  
##  1st Qu.:  1.35207   1st Qu.:512.8                 1st Qu.:     3838  
##  Median :  7.94307   Median :533.7                 Median :  3210303  
##  Mean   : 35.00765   Mean   :532.2                 Mean   : 64397862  
##  3rd Qu.: 67.65912   3rd Qu.:559.0                 3rd Qu.:111495251  
##  Max.   :132.17625   Max.   :594.6                 Max.   :506061817

Bivariate Analysis - Analyzing the relationship between two or more variables.

Correlation Matrix

# Pairwise correlations between the analysed variables, computed on rows
# that are complete across all of them.
cor_matrix <- cor(x = Cleaned_bitcoin_mining[variables], use = "complete.obs")

# Diverging red-white-blue palette generator for the heat map.
palette_anchors <- c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")
col <- colorRampPalette(palette_anchors)

# Upper-triangle heat map with coefficients printed in each cell; variables
# are reordered by hierarchical clustering.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  col = col(200),
  addCoef.col = "black",
  number.cex = 0.5,
  tl.col = "black",
  tl.srt = 90,
  title = "Correlation Matrix",
  mar = c(0, 0, 1, 0)
)

Highly Correlated Variables :

Two-sample t-test comparing power.GUESS..GW before and after 3 January 2013 (the cutoff date used in the code below)

The t-test is used to determine whether there is a statistically significant difference between the means of two groups.

# Split the series at the cutoff date and compare mean estimated power draw
# between the two periods with a Welch two-sample t-test.
#
# The cutoff is built as POSIXct in the column's own time zone so both sides
# of the comparison have the same class. Comparing a POSIXct column directly
# with a Date mixes two classes that store time in different units (seconds
# vs. days), which is not something to rely on across R versions.
# NOTE(review): the cutoff is 3 January 2013 — confirm this is the intended
# date.
date_time_tz <- attr(Cleaned_bitcoin_mining$Date.and.Time, "tzone")
if (is.null(date_time_tz)) {
  date_time_tz <- ""  # "" means the current local time zone
}
cutoff_2013 <- as.POSIXct("2013-01-03", tz = date_time_tz[1])

before_2013 <- subset(Cleaned_bitcoin_mining, Date.and.Time < cutoff_2013)
after_2013 <- subset(Cleaned_bitcoin_mining, Date.and.Time >= cutoff_2013)

# t.test() defaults to the Welch variant (unequal variances).
t_result <- t.test(before_2013$power.GUESS..GW, after_2013$power.GUESS..GW)

print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  before_2013$power.GUESS..GW and after_2013$power.GUESS..GW
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.036382 -4.749997
## sample estimates:
##  mean of x  mean of y 
## 0.01100724 4.90419668

T-test for Selected Variables

# Run the before/after comparison for every analysed variable. Each entry of
# `results` is either the t-test object or a short note explaining why the
# test was skipped.
results <- setNames(
  lapply(variables, function(var) {
    # Skip columns with missing values in either period.
    if (anyNA(before_2013[[var]]) || anyNA(after_2013[[var]])) {
      return("Contains NA values")
    }

    # Skip columns that are constant within a period.
    distinct_before <- length(unique(before_2013[[var]]))
    distinct_after <- length(unique(after_2013[[var]]))
    if (distinct_before == 1 || distinct_after == 1) {
      return("Constant values in one or both periods")
    }

    t.test(before_2013[[var]], after_2013[[var]])
  }),
  variables
)

# Print each result under a heading, separated by a divider line.
for (var in variables) {
  cat("T-test results for", var, ":\n")
  print(results[[var]])
  cat("\n---------------------------------------------\n")
}
## T-test results for power.GUESS..GW :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.036382 -4.749997
## sample estimates:
##  mean of x  mean of y 
## 0.01100724 4.90419668 
## 
## 
## ---------------------------------------------
## T-test results for annualised.consumption.GUESS..TWh :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -44.14892 -41.63848
## sample estimates:
##   mean of x   mean of y 
##  0.09648949 42.99018811 
## 
## 
## ---------------------------------------------
## T-test results for Estimated.efficiency..J.Th :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = 25.546, df = 899.31, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3659260 4268296
## sample estimates:
##  mean of x  mean of y 
## 3994776.19   30998.28 
## 
## 
## ---------------------------------------------
## T-test results for Hydro.only..MtCO2e :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.9271274 -0.8744080
## sample estimates:
##   mean of x   mean of y 
## 0.002026276 0.902793953 
## 
## 
## ---------------------------------------------
## T-test results for Estimated..MtCO2e :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.447, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -22.66799 -21.36865
## sample estimates:
##   mean of x   mean of y 
##  0.05411622 22.07243658 
## 
## 
## ---------------------------------------------
## T-test results for Coal.only..MtCO2e :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -66.997, df = 3914.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -44.19307 -41.68011
## sample estimates:
##   mean of x   mean of y 
##  0.09658598 43.03317831 
## 
## 
## ---------------------------------------------
## T-test results for Emission.intensity..gCO2e.kWh :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = 53.295, df = 4099.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32.17905 34.63700
## sample estimates:
## mean of x mean of y 
##  559.4096  526.0015 
## 
## 
## ---------------------------------------------
## T-test results for Hash.rate.MH.s :
## 
##  Welch Two Sample t-test
## 
## data:  before_2013[[var]] and after_2013[[var]]
## t = -47.902, df = 3914, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -82443618 -75960296
## sample estimates:
##    mean of x    mean of y 
## 9.051497e+00 7.920197e+07 
## 
## 
## ---------------------------------------------